Data Backfill

The goal of this doc is to execute an initial data pull of the hourly demand for California balancing authority subregion (CISO). This includes the following four independent system operators:

The data backfill process includes the following steps:

Load Libraries and Functions

library(dplyr)
library(EIAapi)
library(jsonlite)
library(gt)
library(plotly)
source("../pipeline/eia_data.R")
meta_json <- read_json(path = "../settings/settings.json")
s <- meta_json$series
series <- lapply(1:length(s), function(i) {
    subba_id <- NULL

    subba_id <- as.numeric(s[[i]]$subba_id)
    if (!is.na(as.numeric(s[[i]]$subba_id))) {
        subba_id <- as.numeric(s[[i]]$subba_id)
    } else {
        subba_id <- s[[i]]$subba_id
    }

    return(data.frame(
        parent_id = s[[i]]$parent_id,
        parent_name = s[[i]]$parent_name,
        subba_id = subba_id,
        subba_name = s[[i]]$subba_name
    ))
}) |>
    bind_rows()

api_path <- meta_json$api_path
meta_path <- meta_json$meta_path
data_path <- meta_json$data_path
facets_template <- list(
    parent = NULL,
    subba = NULL
)

start <- as.POSIXct(paste(
    paste(
        meta_json$start$year,
        meta_json$start$month,
        meta_json$start$day,
        sep = "-"
    ),
    " ",
    meta_json$start$hour,
    ":00:00",
    sep = ""
))


end <- as.POSIXct(paste(
    paste(
        meta_json$end$year,
        meta_json$end$month,
        meta_json$end$day,
        sep = "-"
    ),
    " ",
    meta_json$end$hour,
    ":00:00",
    sep = ""
))


# start <- as.POSIXct("2024-05-18 08:00:00")

# end <- as.POSIXct("2024-06-01 01:00:00")
attr(start, "tzone") <- "UTC"
attr(end, "tzone") <- "UTC"

offset <- 2200

eia_api_key <- Sys.getenv("EIA_API_KEY")
metadata <- eia_metadata(api_key = eia_api_key, api_path = api_path)
Warning: input string 'The api_key argument is missing... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_key argument is not valid... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_path argument is missing... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_path argument is not valid, must be a character object ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'Could not pull the metadata... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'Could not parse the metadata JSON... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_key argument is missing... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_key argument is not valid... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_path argument is missing... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_path argument is not valid, must be a character object ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'Could not pull the metadata... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'Could not parse the metadata JSON... ❌
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
print(names(metadata))
 [1] "id"                "name"              "description"      
 [4] "frequency"         "facets"            "data"             
 [7] "startPeriod"       "endPeriod"         "defaultDateFormat"
[10] "defaultFrequency"  "command"          
print(metadata$startPeriod)
[1] "2018-06-19T05"
print(metadata$endPeriod)
[1] "2024-06-30T07"
meta <- NULL
data <- NULL
for (i in 1:nrow(series)) {
    facets <- facets_template
    facets["parent"] <- series[i, "parent_id"]
    facets["subba"] <- series[i, "subba_id"]
    print(facets)

    temp <- eia_backfill(
        start = start,
        end = end,
        offset = offset,
        api_key = eia_api_key,
        api_path = paste(api_path, "data", sep = ""),
        facets = facets
    )
    index <- seq.POSIXt(from = start, to = end, by = "hour")
    ts_obj <- data.frame(period = index) |>
        left_join(temp, by = c("period" = "time"))

    meta_temp <- create_metadata(data = ts_obj, start = start, end = end, type = "backfill")
    meta_temp$index <- 1
    meta_df <- as.data.frame(meta_temp)

    meta <- rbind(meta, meta_df)
    data <- rbind(data, ts_obj)
}
$parent
[1] "CISO"

$subba
[1] "PGAE"

$parent
[1] "CISO"

$subba
[1] "SCE"

$parent
[1] "CISO"

$subba
[1] "SDGE"

$parent
[1] "CISO"

$subba
[1] "VEA"
print(meta)
  index parent subba                time               start
1     1   CISO  PGAE 2024-07-01 00:37:15 2018-07-01 08:00:00
2     1   CISO   SCE 2024-07-01 00:38:04 2018-07-01 08:00:00
3     1   CISO  SDGE 2024-07-01 00:39:03 2018-07-01 08:00:00
4     1   CISO   VEA 2024-07-01 00:39:58 2018-07-01 08:00:00
                  end           start_act    end_act start_match end_match
1 2024-06-28 01:00:00 2018-07-01 08:00:00 2024-06-28        TRUE     FALSE
2 2024-06-28 01:00:00 2018-07-01 08:00:00 2024-06-28        TRUE     FALSE
3 2024-06-28 01:00:00 2018-07-01 08:00:00 2024-06-28        TRUE     FALSE
4 2024-06-28 01:00:00 2018-07-01 08:00:00 2024-06-28        TRUE     FALSE
  n_obs  na     type update success
1 52545 546 backfill  FALSE   FALSE
2 52545 546 backfill  FALSE   FALSE
3 52545 546 backfill  FALSE   FALSE
4 52545 546 backfill  FALSE   FALSE
                                                                 comments
1 The end argument does not match the actual; Missing values were found; 
2 The end argument does not match the actual; Missing values were found; 
3 The end argument does not match the actual; Missing values were found; 
4 The end argument does not match the actual; Missing values were found; 
# The initial pull has some missing values
head(data)
               period subba               subba_name parent
1 2018-07-01 08:00:00  PGAE Pacific Gas and Electric   CISO
2 2018-07-01 09:00:00  PGAE Pacific Gas and Electric   CISO
3 2018-07-01 10:00:00  PGAE Pacific Gas and Electric   CISO
4 2018-07-01 11:00:00  PGAE Pacific Gas and Electric   CISO
5 2018-07-01 12:00:00  PGAE Pacific Gas and Electric   CISO
6 2018-07-01 13:00:00  PGAE Pacific Gas and Electric   CISO
                             parent_name value   value_units
1 California Independent System Operator 12522 megawatthours
2 California Independent System Operator 11745 megawatthours
3 California Independent System Operator 11200 megawatthours
4 California Independent System Operator 10822 megawatthours
5 California Independent System Operator 10644 megawatthours
6 California Independent System Operator 10559 megawatthours
# Save the data
d <- append_data(data_path = data_path, new_data = data, init = TRUE, save = TRUE)
[1] "Initial data pull"
[1] "Save the data to CSV file"
# Save the metadata
meta["success"] <- TRUE
meta["update"] <- TRUE
m <- append_metadata(meta_path = meta_path, new_meta = meta, init = TRUE, save = TRUE)
[1] "Saving the metadata file"

Plot the Series

We will use Plotly to visualize the series:

d <- data |> arrange(subba, period)

d$subba <- as.character(d$subba)

p <- plot_ly(d, x = ~period, y = ~value, color = ~subba, type = "scatter", mode = "lines")

p